import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, cross_validate, RandomizedSearchCV
from sklearn.metrics import r2_score, accuracy_score, confusion_matrix
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import statistics as stats
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import MDS
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, cross_validate, RandomizedSearchCV, RepeatedStratifiedKFold, StratifiedKFold, ShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, VotingClassifier, StackingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer, roc_auc_score, balanced_accuracy_score, recall_score, confusion_matrix, SCORERS
from sklearn.utils import class_weight
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.feature_selection import SelectKBest
from imblearn.over_sampling import SMOTE, SMOTENC, RandomOverSampler
from sklearn.feature_selection import chi2
#import xgboost as xgb
import sweetviz as sv
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE
from imblearn.under_sampling import RandomUnderSampler
from catboost import CatBoostClassifier, Pool
from imblearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVC
import warnings
warnings.filterwarnings('ignore')
Raw data: consists of train and test data. The raw data does not include information about the labels. Due to privacy concerns, the feature names are masked except for the first four columns. These are:
# Load the raw train/test splits; all exploration below runs on a copy of the
# training frame so `train` itself stays untouched.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
data = train.copy()
Pandas Profiling: Generates profile reports from a pandas DataFrame. The pandas df.describe() function is great but a little basic for serious exploratory data analysis. pandas_profiling extends the pandas DataFrame with df.profile_report() for quick data analysis.
# Quick EDA overview; minimal=True skips the expensive correlation/interaction
# sections so the report renders fast on 60+ columns.
ProfileReport(data,minimal=True)
Summarize dataset: 100%|██████████| 69/69 [00:00<00:00, 166.75it/s, Completed] Generate report structure: 100%|██████████| 1/1 [00:16<00:00, 16.28s/it] Render HTML: 100%|██████████| 1/1 [00:01<00:00, 1.60s/it]
As we can see in the above report, there is no missing data in any of the columns.
# Peek at the first rows to sanity-check the load and the masked feature columns.
data.head()
| loan_application_id | loan_amount | default | customer_age | Var_1 | Var_2 | Var_3 | Var_4 | Var_5 | Var_6 | ... | Var_50 | Var_51 | Var_52 | Var_53 | Var_54 | Var_55 | Var_56 | Var_57 | Var_58 | Var_59 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2855.0 | 0 | 25 | 3 | 1 | 1 | 37 | 41 | 5 | ... | 0 | 0 | 0 | N | 0.0 | 0 | 0 | 1 | 1 | 0 |
| 1 | 2 | 1577.0 | 0 | 53 | 0 | 0 | 0 | 0 | 26 | 2 | ... | 0 | 0 | 0 | N | 0.0 | 0 | 0 | 0 | 1 | 0 |
| 2 | 3 | 6858.0 | 0 | 39 | 4 | 2 | 3 | 55 | 58 | 10 | ... | 700 | 0 | 700 | N | 700.0 | 0 | 0 | 4 | 6 | 0 |
| 3 | 4 | 2434.5 | 0 | 49 | 3 | 1 | 3 | 98 | 98 | 11 | ... | 16350 | 127 | 16935 | N | 8467.5 | 0 | 0 | 0 | 5 | 0 |
| 4 | 5 | 4964.0 | 0 | 47 | 2 | 1 | 1 | 110 | 124 | 15 | ... | 3000 | 142 | 3400 | N | 1700.0 | 0 | 0 | 2 | 10 | 0 |
5 rows × 63 columns
As we can see in the above report, Var_39 and Var_53 consist of boolean (Y/N) values. That's why the encoding process is applied.
# Var_39 and Var_53 hold Y/N flags; one-hot encode them into 0/1 indicator columns.
data=pd.get_dummies(data, columns=['Var_39','Var_53'])
# Confirm the new Var_39_*/Var_53_* columns were created.
data.columns
Index(['loan_application_id', 'loan_amount', 'default', 'customer_age',
'Var_1', 'Var_2', 'Var_3', 'Var_4', 'Var_5', 'Var_6', 'Var_7', 'Var_8',
'Var_9', 'Var_10', 'Var_11', 'Var_12', 'Var_13', 'Var_14', 'Var_15',
'Var_16', 'Var_17', 'Var_18', 'Var_19', 'Var_20', 'Var_21', 'Var_22',
'Var_23', 'Var_24', 'Var_25', 'Var_26', 'Var_27', 'Var_28', 'Var_29',
'Var_30', 'Var_31', 'Var_32', 'Var_33', 'Var_34', 'Var_35', 'Var_36',
'Var_37', 'Var_38', 'Var_40', 'Var_41', 'Var_42', 'Var_43', 'Var_44',
'Var_45', 'Var_46', 'Var_47', 'Var_48', 'Var_49', 'Var_50', 'Var_51',
'Var_52', 'Var_54', 'Var_55', 'Var_56', 'Var_57', 'Var_58', 'Var_59',
'Var_39_N', 'Var_39_Y', 'Var_53_N', 'Var_53_Y'],
dtype='object')
# Target and feature matrix. The id column, the target itself and the one-hot
# indicator columns are excluded; everything else keeps its original order,
# matching the column listing printed above.
y = data["default"]
_excluded = {"loan_application_id", "default",
             "Var_39_N", "Var_39_Y", "Var_53_N", "Var_53_Y"}
feature_cols = [col for col in data.columns if col not in _excluded]
X = data[feature_cols]
Preprocessing data:
We used MinMaxScaler to scale the data into the range (0, 1), so it is on the same scale as our categorical variables.
# Scale all features into [0, 1] so they are on the same footing as the 0/1
# indicator variables. The original cell assigned the scaled ndarray back into
# a sliced DataFrame (a chained assignment only kept quiet by the warnings
# filter) and had a dead `X_scaled = X[:]` line; rebuilding the frame directly
# is equivalent and warning-free.
scaler = MinMaxScaler()
# fit_transform returns a bare ndarray; restore column names and index so the
# feature-selection cells below can keep using them.
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
# Alias used by the chi2 scoring cell.
X_scaled = X.copy()
Feature Selection
Feature selection is the process of reducing the number of input variables when developing a predictive model. It is desirable to reduce the number of input variables to both reduce the computational cost of modeling and, in some cases, to improve the performance of the model. To select the best features, we implemented two different methods, which are "KBest" and "Recursive Feature Elimination".
Select K Best (chi2 score func)
This method selects features according to the k highest scores by using Chi-squared scoring. Here, k is the number of features you want to select. It creates the object for SelectKBest and fit and transform the classification data.
# Score every feature against the target with the chi-squared statistic
# (valid here because MinMax scaling made all features non-negative), then
# rank features from most to least important.
selector = SelectKBest(score_func=chi2, k="all")
selector.fit(X_scaled, y)
featureScores = pd.concat(
    [pd.DataFrame(X_scaled.columns), pd.DataFrame(selector.scores_)],
    axis=1,
)
featureScores.columns = ['Specs','Score']
featureScores = featureScores.sort_values(by="Score", ascending=False)
featureScores
| Specs | Score | |
|---|---|---|
| 4 | Var_3 | 11.653724 |
| 3 | Var_2 | 9.054267 |
| 28 | Var_27 | 5.464084 |
| 5 | Var_4 | 3.068150 |
| 47 | Var_47 | 3.027410 |
| 13 | Var_12 | 2.758794 |
| 14 | Var_13 | 2.706323 |
| 1 | customer_age | 2.627567 |
| 6 | Var_5 | 2.448661 |
| 0 | loan_amount | 2.302295 |
| 9 | Var_8 | 2.024877 |
| 23 | Var_22 | 1.497434 |
| 34 | Var_33 | 1.491243 |
| 27 | Var_26 | 1.293709 |
| 26 | Var_25 | 1.094182 |
| 52 | Var_52 | 1.075771 |
| 50 | Var_50 | 1.041325 |
| 15 | Var_14 | 0.860144 |
| 20 | Var_19 | 0.855907 |
| 42 | Var_42 | 0.808981 |
| 11 | Var_10 | 0.765859 |
| 22 | Var_21 | 0.765859 |
| 21 | Var_20 | 0.686354 |
| 43 | Var_43 | 0.594642 |
| 38 | Var_37 | 0.586909 |
| 37 | Var_36 | 0.569432 |
| 10 | Var_9 | 0.543124 |
| 19 | Var_18 | 0.543124 |
| 17 | Var_16 | 0.540499 |
| 53 | Var_54 | 0.463875 |
| 30 | Var_29 | 0.383309 |
| 2 | Var_1 | 0.351560 |
| 8 | Var_7 | 0.345531 |
| 25 | Var_24 | 0.324515 |
| 7 | Var_6 | 0.321886 |
| 24 | Var_23 | 0.314007 |
| 46 | Var_46 | 0.276398 |
| 32 | Var_31 | 0.248657 |
| 16 | Var_15 | 0.197337 |
| 29 | Var_28 | 0.140179 |
| 56 | Var_57 | 0.116951 |
| 18 | Var_17 | 0.115056 |
| 48 | Var_48 | 0.113402 |
| 58 | Var_59 | 0.108052 |
| 45 | Var_45 | 0.103393 |
| 51 | Var_51 | 0.092864 |
| 41 | Var_41 | 0.047730 |
| 49 | Var_49 | 0.034064 |
| 44 | Var_44 | 0.031671 |
| 33 | Var_32 | 0.025675 |
| 40 | Var_40 | 0.018051 |
| 31 | Var_30 | 0.013852 |
| 35 | Var_34 | 0.011481 |
| 57 | Var_58 | 0.006080 |
| 39 | Var_38 | 0.005995 |
| 55 | Var_56 | 0.004766 |
| 36 | Var_35 | 0.002172 |
| 12 | Var_11 | 0.000177 |
| 54 | Var_55 | 0.000013 |
There is a drastic change in the importance of some features. This is interesting. The scaled feature importance results are preferred.
Let's drop n least important features from our data:
def _drop_n_weakest(features, scores, n):
    """Return a copy of *features* without the *n* lowest chi2-scoring columns.

    `scores` is the ranked featureScores frame (descending), so its last
    n rows are the weakest features.
    """
    weakest = list(scores["Specs"].iloc[-n:].values)
    # Use the `columns=` keyword: the old positional-axis form
    # `X.drop(cols, 1)` was deprecated and removed in pandas 2.0.
    return features.drop(columns=weakest)

# Three candidate feature sets of decreasing size.
X_selected20 = _drop_n_weakest(X, featureScores, 20)  # drop 20 weakest
X_selected30 = _drop_n_weakest(X, featureScores, 30)  # drop 30 weakest
X_selected35 = _drop_n_weakest(X, featureScores, 35)  # drop 35 weakest
Recursive Feature Elimination
The Recursive Feature Elimination (RFE) works by recursively removing attributes and building a model on those attributes that remain.
It uses the model accuracy to identify which attributes (and combination of attributes) contribute the most to predicting the target attribute.
The example below uses RFE with the random forest algorithm to select the top 30 features. The choice of algorithm does not matter too much as long as it is skillful and consistent.
# RFE with a random-forest base estimator: recursively discard one feature
# per step until 30 remain, then keep only the surviving columns.
rf_selector = RFE(RandomForestClassifier(), n_features_to_select=30, step=1)
rf_selector.fit(X, y)
features_to_keep = list(X.columns[rf_selector.support_])
X_selected_rf = X[features_to_keep]
And then, RFE with logistic regression and gradient boosting are applied, respectively.
# Repeat the RFE procedure with two alternative base estimators:
# balanced logistic regression and gradient boosting.
lr_selector = RFE(LogisticRegression(class_weight='balanced'),
                  n_features_to_select=30, step=1).fit(X, y)
features_to_keep1 = list(X.columns[lr_selector.support_])
X_selected_lr = X[features_to_keep1]

gb_selector = RFE(GradientBoostingClassifier(),
                  n_features_to_select=30, step=1).fit(X, y)
features_to_keep2 = list(X.columns[gb_selector.support_])
X_selected_gb = X[features_to_keep2]
Evaluation Functions
We may want to fit the models with balanced sample weights, as this problem has a class-imbalance problem (3:1).
That's why an option to use sample weights when fitting the models is provided in our get_score function.
It returns the scores acquired from 5 fold stratified cross validation as a dataframe.
def custom_score(y_true, y_pred):
    """Average of balanced accuracy (on rounded predictions) and ROC-AUC."""
    bal_acc = balanced_accuracy_score(y_true, y_pred.round())
    auc = roc_auc_score(y_true, y_pred)
    return (bal_acc + auc) / 2
def get_score(model, X, y, cv=None, sample_weight=False):
    """Cross-validate *model* and tabulate ROC-AUC / balanced-accuracy scores.

    Parameters
    ----------
    model : estimator to evaluate.
    X, y : feature matrix and target.
    cv : CV splitter; defaults to a fresh shuffled 5-fold StratifiedKFold
        built per call (the old instance default was shared across calls).
    sample_weight : if True, fit each fold with balanced sample weights to
        counter the class imbalance.

    Returns
    -------
    pandas.DataFrame with one row per fold, per-fold "test ort"/"train ort"
    averages ("ort" = average of AUC and balanced accuracy), plus 'Mean'
    and 'SD' summary rows computed over the folds only.
    """
    if cv is None:
        cv = StratifiedKFold(n_splits=5, shuffle=True)
    # Build the optional fit parameters once instead of duplicating the
    # whole cross_validate call in two branches.
    fit_params = None
    if sample_weight:
        fit_params = {'sample_weight': class_weight.compute_sample_weight(
            class_weight='balanced', y=y)}
    scores = cross_validate(model, X, y, cv=cv,
                            scoring=["roc_auc", "balanced_accuracy"],
                            return_train_score=True,
                            fit_params=fit_params)
    scores = pd.DataFrame(scores)
    scores["test ort"] = (scores["test_roc_auc"] + scores["test_balanced_accuracy"])/2
    scores["train ort"] = (scores["train_roc_auc"] + scores["train_balanced_accuracy"])/2
    # Compute both summaries from the fold rows BEFORE appending them: the
    # original appended 'Mean' first, so 'SD' wrongly included the mean row
    # in the standard deviation and understated the fold-to-fold spread.
    fold_mean = scores.mean()
    fold_sd = scores.std()
    scores.loc['Mean'] = fold_mean
    scores.loc['SD'] = fold_sd
    return scores
get_score(LogisticRegression(), X, y)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.022141 | 0.006358 | 0.693413 | 0.823280 | 0.511470 | 0.512872 | 0.602441 | 0.668076 |
| 1 | 0.013256 | 0.006761 | 0.805445 | 0.795851 | 0.498382 | 0.509654 | 0.651913 | 0.652752 |
| 2 | 0.014423 | 0.004652 | 0.802113 | 0.806479 | 0.529412 | 0.508844 | 0.665762 | 0.657662 |
| 3 | 0.012447 | 0.004544 | 0.755288 | 0.804309 | 0.498377 | 0.513789 | 0.626832 | 0.659049 |
| 4 | 0.011559 | 0.004563 | 0.793785 | 0.806289 | 0.496753 | 0.517034 | 0.645269 | 0.661661 |
| Mean | 0.014765 | 0.005376 | 0.770009 | 0.807241 | 0.506879 | 0.512438 | 0.638444 | 0.659840 |
| SD | 0.003807 | 0.000976 | 0.042274 | 0.008912 | 0.012456 | 0.002960 | 0.021933 | 0.005035 |
get_score(LogisticRegression(), X_selected20, y)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.025355 | 0.008995 | 0.743290 | 0.801012 | 0.498382 | 0.509249 | 0.620836 | 0.655131 |
| 1 | 0.011930 | 0.006022 | 0.796973 | 0.798969 | 0.508233 | 0.510464 | 0.652603 | 0.654717 |
| 2 | 0.014642 | 0.004307 | 0.694555 | 0.813673 | 0.498382 | 0.516495 | 0.596469 | 0.665084 |
| 3 | 0.012047 | 0.004469 | 0.851948 | 0.779651 | 0.514286 | 0.502840 | 0.683117 | 0.641245 |
| 4 | 0.012107 | 0.005443 | 0.750371 | 0.798546 | 0.500000 | 0.506085 | 0.625186 | 0.652315 |
| Mean | 0.015216 | 0.005847 | 0.767427 | 0.798370 | 0.503857 | 0.509027 | 0.635642 | 0.653698 |
| SD | 0.005170 | 0.001696 | 0.053293 | 0.010881 | 0.006368 | 0.004579 | 0.029681 | 0.007612 |
get_score(LogisticRegression(), X_selected30, y)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.015867 | 0.004959 | 0.798877 | 0.783619 | 0.498382 | 0.509249 | 0.648629 | 0.646434 |
| 1 | 0.012820 | 0.005823 | 0.829526 | 0.774581 | 0.513088 | 0.502408 | 0.671307 | 0.638494 |
| 2 | 0.019575 | 0.005558 | 0.720445 | 0.794559 | 0.498382 | 0.512872 | 0.609414 | 0.653715 |
| 3 | 0.011807 | 0.005121 | 0.743785 | 0.792234 | 0.496753 | 0.509734 | 0.620269 | 0.650984 |
| 4 | 0.011989 | 0.004725 | 0.745918 | 0.793445 | 0.514286 | 0.508925 | 0.630102 | 0.651185 |
| Mean | 0.014412 | 0.005237 | 0.767710 | 0.787688 | 0.504178 | 0.508637 | 0.635944 | 0.648163 |
| SD | 0.002965 | 0.000400 | 0.040184 | 0.007607 | 0.007796 | 0.003418 | 0.021887 | 0.005374 |
get_score(LogisticRegression(), X_selected35, y)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.013813 | 0.004592 | 0.756996 | 0.790002 | 0.500000 | 0.508844 | 0.628498 | 0.649423 |
| 1 | 0.011075 | 0.004812 | 0.705406 | 0.798534 | 0.496764 | 0.512872 | 0.601085 | 0.655703 |
| 2 | 0.010963 | 0.005751 | 0.765182 | 0.785621 | 0.496764 | 0.509654 | 0.630973 | 0.647638 |
| 3 | 0.012270 | 0.005471 | 0.786456 | 0.778167 | 0.511039 | 0.498381 | 0.648748 | 0.638274 |
| 4 | 0.012291 | 0.004094 | 0.784787 | 0.774763 | 0.500000 | 0.502030 | 0.642393 | 0.638397 |
| Mean | 0.012082 | 0.004944 | 0.759765 | 0.785417 | 0.500913 | 0.506356 | 0.630339 | 0.645887 |
| SD | 0.001034 | 0.000599 | 0.029429 | 0.008476 | 0.005266 | 0.005326 | 0.016394 | 0.006723 |
get_score(LogisticRegression(), X_selected_rf, y)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.015281 | 0.005285 | 0.751095 | 0.789450 | 0.500000 | 0.513277 | 0.625547 | 0.651364 |
| 1 | 0.012182 | 0.005587 | 0.818009 | 0.780001 | 0.500000 | 0.509654 | 0.659004 | 0.644828 |
| 2 | 0.013092 | 0.005536 | 0.727299 | 0.794576 | 0.498382 | 0.509249 | 0.612840 | 0.651913 |
| 3 | 0.011836 | 0.004350 | 0.749814 | 0.790821 | 0.511039 | 0.506490 | 0.630427 | 0.648655 |
| 4 | 0.011355 | 0.004792 | 0.752597 | 0.785857 | 0.498377 | 0.509734 | 0.625487 | 0.647795 |
| Mean | 0.012749 | 0.005110 | 0.759763 | 0.788141 | 0.501559 | 0.509681 | 0.630661 | 0.648911 |
| SD | 0.001387 | 0.000473 | 0.030568 | 0.004935 | 0.004795 | 0.002159 | 0.015323 | 0.002569 |
get_score(LogisticRegression(), X_selected_lr, y)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.015475 | 0.006145 | 0.752427 | 0.813785 | 0.509852 | 0.517306 | 0.631139 | 0.665545 |
| 1 | 0.011950 | 0.004769 | 0.843042 | 0.795980 | 0.544118 | 0.505220 | 0.693580 | 0.650600 |
| 2 | 0.012651 | 0.005700 | 0.758519 | 0.815681 | 0.514706 | 0.508844 | 0.636612 | 0.662262 |
| 3 | 0.011612 | 0.004788 | 0.771614 | 0.816756 | 0.495130 | 0.513384 | 0.633372 | 0.665070 |
| 4 | 0.011964 | 0.004246 | 0.773006 | 0.808907 | 0.500000 | 0.509329 | 0.636503 | 0.659118 |
| Mean | 0.012730 | 0.005130 | 0.779722 | 0.810222 | 0.512761 | 0.510817 | 0.646241 | 0.660519 |
| SD | 0.001413 | 0.000691 | 0.032603 | 0.007613 | 0.017142 | 0.004150 | 0.023758 | 0.005465 |
get_score(LogisticRegression(), X_selected_gb, y)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.015040 | 0.005311 | 0.774319 | 0.798534 | 0.500000 | 0.509249 | 0.637160 | 0.653892 |
| 1 | 0.012560 | 0.004912 | 0.752427 | 0.802005 | 0.498382 | 0.513682 | 0.625405 | 0.657844 |
| 2 | 0.014461 | 0.004360 | 0.750714 | 0.805522 | 0.514706 | 0.506031 | 0.632710 | 0.655777 |
| 3 | 0.010369 | 0.005888 | 0.742950 | 0.801708 | 0.500000 | 0.509734 | 0.621475 | 0.655721 |
| 4 | 0.013674 | 0.004684 | 0.801577 | 0.790721 | 0.525325 | 0.506490 | 0.663451 | 0.648605 |
| Mean | 0.013221 | 0.005031 | 0.764397 | 0.799698 | 0.507682 | 0.509037 | 0.636040 | 0.654368 |
| SD | 0.001651 | 0.000529 | 0.021313 | 0.005005 | 0.010631 | 0.002744 | 0.014758 | 0.003141 |
get_score(LogisticRegression(), X_selected20, y, sample_weight=True)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.037025 | 0.007511 | 0.793737 | 0.806515 | 0.725395 | 0.744245 | 0.759566 | 0.775380 |
| 1 | 0.021115 | 0.007456 | 0.800114 | 0.803303 | 0.728488 | 0.731731 | 0.764301 | 0.767517 |
| 2 | 0.015622 | 0.006264 | 0.698839 | 0.825876 | 0.624072 | 0.749583 | 0.661455 | 0.787729 |
| 3 | 0.012183 | 0.004833 | 0.820315 | 0.800520 | 0.735390 | 0.750430 | 0.777853 | 0.775475 |
| 4 | 0.014120 | 0.005618 | 0.787477 | 0.803789 | 0.729870 | 0.739487 | 0.758673 | 0.771638 |
| Mean | 0.020013 | 0.006336 | 0.780096 | 0.808000 | 0.708643 | 0.743095 | 0.744370 | 0.775548 |
| SD | 0.009010 | 0.001041 | 0.042099 | 0.009138 | 0.042409 | 0.006919 | 0.042021 | 0.006756 |
get_score(LogisticRegression(), X_selected30, y, sample_weight=True)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.021351 | 0.006661 | 0.775176 | 0.794794 | 0.730249 | 0.711231 | 0.752713 | 0.753012 |
| 1 | 0.017161 | 0.007531 | 0.742052 | 0.802116 | 0.642014 | 0.731755 | 0.692033 | 0.766936 |
| 2 | 0.019718 | 0.005733 | 0.789168 | 0.790231 | 0.744812 | 0.732495 | 0.766990 | 0.761363 |
| 3 | 0.012305 | 0.004493 | 0.760111 | 0.799456 | 0.687013 | 0.739487 | 0.723562 | 0.769472 |
| 4 | 0.013095 | 0.005861 | 0.780427 | 0.796253 | 0.708117 | 0.730158 | 0.744272 | 0.763205 |
| Mean | 0.016726 | 0.006056 | 0.769387 | 0.796570 | 0.702441 | 0.729025 | 0.735914 | 0.762798 |
| SD | 0.003557 | 0.001013 | 0.016613 | 0.004063 | 0.036029 | 0.009453 | 0.026061 | 0.005650 |
get_score(LogisticRegression(), X_selected35, y, sample_weight=True)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.017731 | 0.005181 | 0.764325 | 0.798975 | 0.721873 | 0.718859 | 0.743099 | 0.758917 |
| 1 | 0.013395 | 0.004700 | 0.820008 | 0.781640 | 0.759804 | 0.712758 | 0.789906 | 0.747199 |
| 2 | 0.013303 | 0.005917 | 0.775081 | 0.793408 | 0.687607 | 0.728513 | 0.731344 | 0.760961 |
| 3 | 0.013022 | 0.004641 | 0.701113 | 0.807217 | 0.650000 | 0.733402 | 0.675557 | 0.770309 |
| 4 | 0.008727 | 0.005566 | 0.777180 | 0.792452 | 0.662013 | 0.723674 | 0.719596 | 0.758063 |
| Mean | 0.013236 | 0.005201 | 0.767541 | 0.794738 | 0.696259 | 0.723441 | 0.731900 | 0.759090 |
| SD | 0.002850 | 0.000492 | 0.038277 | 0.008393 | 0.040203 | 0.007213 | 0.036923 | 0.007374 |
get_score(LogisticRegression(), X_selected_rf, y, sample_weight=True)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.015129 | 0.005139 | 0.839140 | 0.784089 | 0.757757 | 0.709158 | 0.798449 | 0.746623 |
| 1 | 0.014002 | 0.004161 | 0.764040 | 0.798640 | 0.722302 | 0.725272 | 0.743171 | 0.761956 |
| 2 | 0.017361 | 0.004163 | 0.766990 | 0.797325 | 0.643632 | 0.736952 | 0.705311 | 0.767138 |
| 3 | 0.010500 | 0.004510 | 0.749907 | 0.801478 | 0.720130 | 0.739082 | 0.735019 | 0.770280 |
| 4 | 0.011316 | 0.004223 | 0.712616 | 0.811437 | 0.652273 | 0.750442 | 0.682444 | 0.780939 |
| Mean | 0.013662 | 0.004439 | 0.766539 | 0.798593 | 0.699219 | 0.732181 | 0.732879 | 0.765387 |
| SD | 0.002508 | 0.000373 | 0.041138 | 0.008778 | 0.044024 | 0.014012 | 0.039277 | 0.011249 |
get_score(LogisticRegression(), X_selected_lr, y, sample_weight=True)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.040669 | 0.006250 | 0.784123 | 0.819240 | 0.715401 | 0.740264 | 0.749762 | 0.779752 |
| 1 | 0.014935 | 0.006501 | 0.761184 | 0.820532 | 0.654816 | 0.766436 | 0.708000 | 0.793484 |
| 2 | 0.019597 | 0.004634 | 0.798496 | 0.816445 | 0.736722 | 0.751539 | 0.767609 | 0.783992 |
| 3 | 0.010784 | 0.004276 | 0.845455 | 0.805958 | 0.816234 | 0.735028 | 0.830844 | 0.770493 |
| 4 | 0.009041 | 0.005557 | 0.758998 | 0.820940 | 0.713961 | 0.770292 | 0.736480 | 0.795616 |
| Mean | 0.019005 | 0.005444 | 0.789651 | 0.816623 | 0.727427 | 0.752712 | 0.758539 | 0.784667 |
| SD | 0.011429 | 0.000872 | 0.031537 | 0.005560 | 0.052099 | 0.013903 | 0.041056 | 0.009199 |
get_score(LogisticRegression(), X_selected_gb, y, sample_weight=True)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.020643 | 0.006713 | 0.780697 | 0.810285 | 0.692461 | 0.753923 | 0.736579 | 0.782104 |
| 1 | 0.022619 | 0.006969 | 0.837141 | 0.797507 | 0.756425 | 0.735760 | 0.796783 | 0.766633 |
| 2 | 0.020971 | 0.006092 | 0.776128 | 0.807554 | 0.704217 | 0.748321 | 0.740172 | 0.777937 |
| 3 | 0.016728 | 0.006002 | 0.726809 | 0.813948 | 0.666883 | 0.745572 | 0.696846 | 0.779760 |
| 4 | 0.018681 | 0.006489 | 0.776809 | 0.808434 | 0.735065 | 0.747995 | 0.755937 | 0.778214 |
| Mean | 0.019928 | 0.006453 | 0.779517 | 0.807546 | 0.711010 | 0.746314 | 0.745263 | 0.776930 |
| SD | 0.002031 | 0.000366 | 0.034982 | 0.005478 | 0.031555 | 0.005945 | 0.032292 | 0.005357 |
get_score(LogisticRegression(class_weight='balanced'), X, y)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.021826 | 0.004681 | 0.807348 | 0.814231 | 0.748049 | 0.745889 | 0.777698 | 0.780060 |
| 1 | 0.015269 | 0.004992 | 0.786693 | 0.815599 | 0.697601 | 0.759977 | 0.742147 | 0.787788 |
| 2 | 0.015915 | 0.005157 | 0.816010 | 0.809310 | 0.797164 | 0.727750 | 0.806587 | 0.768530 |
| 3 | 0.015541 | 0.005297 | 0.725974 | 0.832802 | 0.657468 | 0.765832 | 0.691721 | 0.799317 |
| 4 | 0.015296 | 0.005168 | 0.752134 | 0.825763 | 0.690260 | 0.757717 | 0.721197 | 0.791740 |
| Mean | 0.016769 | 0.005059 | 0.777632 | 0.819541 | 0.718108 | 0.751433 | 0.747870 | 0.785487 |
| SD | 0.002539 | 0.000212 | 0.033932 | 0.008520 | 0.049030 | 0.013501 | 0.040582 | 0.010514 |
get_score(LogisticRegression(class_weight='balanced'), X_selected20, y)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.023208 | 0.005601 | 0.779650 | 0.804260 | 0.713783 | 0.729776 | 0.746716 | 0.767018 |
| 1 | 0.063537 | 0.006523 | 0.774891 | 0.809991 | 0.700552 | 0.740240 | 0.737721 | 0.775116 |
| 2 | 0.022069 | 0.004304 | 0.738150 | 0.812440 | 0.668190 | 0.749060 | 0.703170 | 0.780750 |
| 3 | 0.011048 | 0.004224 | 0.825325 | 0.804681 | 0.735065 | 0.743940 | 0.780195 | 0.774311 |
| 4 | 0.010932 | 0.004289 | 0.746568 | 0.813310 | 0.677597 | 0.745155 | 0.712083 | 0.779233 |
| Mean | 0.026159 | 0.004988 | 0.772916 | 0.808936 | 0.699037 | 0.741634 | 0.735977 | 0.775285 |
| SD | 0.019405 | 0.000924 | 0.030656 | 0.003808 | 0.024203 | 0.006564 | 0.027279 | 0.004791 |
get_score(LogisticRegression(class_weight='balanced'), X_selected30, y)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.032847 | 0.006986 | 0.778508 | 0.795293 | 0.684514 | 0.724508 | 0.731511 | 0.759901 |
| 1 | 0.025122 | 0.007003 | 0.740815 | 0.806286 | 0.668190 | 0.747892 | 0.704502 | 0.777089 |
| 2 | 0.013877 | 0.004815 | 0.763373 | 0.796203 | 0.695698 | 0.728108 | 0.729536 | 0.762156 |
| 3 | 0.011135 | 0.004783 | 0.805380 | 0.785141 | 0.752273 | 0.709087 | 0.778827 | 0.747114 |
| 4 | 0.013056 | 0.005417 | 0.707236 | 0.806968 | 0.607468 | 0.739892 | 0.657352 | 0.773430 |
| Mean | 0.019207 | 0.005801 | 0.759062 | 0.797978 | 0.681628 | 0.729898 | 0.720345 | 0.763938 |
| SD | 0.008396 | 0.001001 | 0.033350 | 0.008062 | 0.046642 | 0.013334 | 0.039619 | 0.010635 |
get_score(LogisticRegression(class_weight='balanced'), X_selected35, y)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.018767 | 0.006876 | 0.775842 | 0.787876 | 0.697601 | 0.710779 | 0.736722 | 0.749328 |
| 1 | 0.067232 | 0.006953 | 0.762136 | 0.796338 | 0.676280 | 0.724080 | 0.719208 | 0.760209 |
| 2 | 0.030398 | 0.007094 | 0.768894 | 0.795211 | 0.712164 | 0.727703 | 0.740529 | 0.761457 |
| 3 | 0.016930 | 0.006435 | 0.782839 | 0.793380 | 0.745779 | 0.718006 | 0.764309 | 0.755693 |
| 4 | 0.022244 | 0.007031 | 0.747588 | 0.800089 | 0.652273 | 0.715958 | 0.699930 | 0.758023 |
| Mean | 0.031114 | 0.006878 | 0.767460 | 0.794579 | 0.696820 | 0.719305 | 0.732140 | 0.756942 |
| SD | 0.018640 | 0.000233 | 0.012100 | 0.004005 | 0.031747 | 0.005982 | 0.021591 | 0.004284 |
get_score(LogisticRegression(class_weight='balanced'), X_selected_rf, y)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.015619 | 0.005773 | 0.794213 | 0.791018 | 0.759376 | 0.713187 | 0.776794 | 0.752102 |
| 1 | 0.012865 | 0.004219 | 0.778793 | 0.794782 | 0.726870 | 0.732900 | 0.752832 | 0.763841 |
| 2 | 0.013844 | 0.004495 | 0.769465 | 0.797695 | 0.707596 | 0.732970 | 0.738530 | 0.765332 |
| 3 | 0.012649 | 0.004152 | 0.742393 | 0.801915 | 0.655519 | 0.739898 | 0.698956 | 0.770906 |
| 4 | 0.010918 | 0.004247 | 0.724026 | 0.804569 | 0.641234 | 0.751650 | 0.682630 | 0.778110 |
| Mean | 0.013179 | 0.004577 | 0.761778 | 0.797996 | 0.698119 | 0.734121 | 0.729949 | 0.766058 |
| SD | 0.001542 | 0.000609 | 0.025305 | 0.004851 | 0.044090 | 0.012504 | 0.034616 | 0.008590 |
get_score(LogisticRegression(class_weight='balanced'), X_selected35, y, sample_weight=True)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.042567 | 0.008436 | 0.767371 | 0.793455 | 0.597087 | 0.578201 | 0.682229 | 0.685828 |
| 1 | 0.043527 | 0.006755 | 0.797068 | 0.781211 | 0.572816 | 0.563209 | 0.684942 | 0.672210 |
| 2 | 0.026357 | 0.006003 | 0.701028 | 0.810619 | 0.564582 | 0.590357 | 0.632805 | 0.700488 |
| 3 | 0.024596 | 0.007722 | 0.778015 | 0.792039 | 0.563312 | 0.571660 | 0.670663 | 0.681849 |
| 4 | 0.030615 | 0.007542 | 0.770130 | 0.791507 | 0.564935 | 0.577328 | 0.667532 | 0.684417 |
| Mean | 0.033532 | 0.007292 | 0.762722 | 0.793766 | 0.572546 | 0.576151 | 0.667634 | 0.684959 |
| SD | 0.008017 | 0.000837 | 0.032546 | 0.009485 | 0.012720 | 0.008889 | 0.018627 | 0.009104 |
# Hyperparameter grid for the logistic-regression search.
# NOTE(review): not every combination is valid in sklearn (e.g. dual=True
# pairs only with the l2 penalty / liblinear solver) — confirm that failed
# fits are acceptable; GridSearchCV simply scores them as failures.
log_grid_search = {'penalty': [None, "l1", "l2"],
'C': list(np.logspace(-4,3,8)),
'class_weight': ["balanced"],
'max_iter': [2500],
'dual': [True, False],
'tol': [1e-6,1e-5,1e-4,1e-3]
}
# Exhaustive search scored with the custom (balanced accuracy + AUC)/2 metric;
# needs_proba=True feeds predicted probabilities into custom_score.
log_model_grid = GridSearchCV(estimator=LogisticRegression(),
param_grid=log_grid_search,
scoring = make_scorer(custom_score,needs_proba=True),
cv=StratifiedKFold(n_splits=5, shuffle=True), verbose=5, n_jobs=-1)
log_model_grid.fit(X_selected30, y)
print(log_model_grid.best_params_)
# Re-evaluate the best configuration with the standard CV report.
get_score(log_model_grid.best_estimator_, X_selected30, y)
Fitting 5 folds for each of 192 candidates, totalling 960 fits
{'C': 1.0, 'class_weight': 'balanced', 'dual': False, 'max_iter': 2500, 'penalty': 'l2', 'tol': 1e-06}
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.018268 | 0.009212 | 0.768799 | 0.796115 | 0.702313 | 0.711590 | 0.735556 | 0.753852 |
| 1 | 0.014232 | 0.005505 | 0.713021 | 0.806932 | 0.627594 | 0.745461 | 0.670307 | 0.776196 |
| 2 | 0.017872 | 0.006068 | 0.780506 | 0.794770 | 0.703931 | 0.732518 | 0.742219 | 0.763644 |
| 3 | 0.016928 | 0.006502 | 0.785807 | 0.793635 | 0.704545 | 0.721239 | 0.745176 | 0.757437 |
| 4 | 0.018694 | 0.004715 | 0.768738 | 0.797937 | 0.717532 | 0.732997 | 0.743135 | 0.765467 |
| Mean | 0.017199 | 0.006401 | 0.763374 | 0.797878 | 0.691183 | 0.728761 | 0.727279 | 0.763319 |
| SD | 0.001594 | 0.001528 | 0.026040 | 0.004749 | 0.032258 | 0.011510 | 0.028668 | 0.007678 |
# Manually try a much weaker regularization (C=100) than the grid winner (C=1).
get_score(LogisticRegression(C=100, class_weight='balanced', penalty='l2', tol=1e-6, max_iter=20000), X_selected30, y)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.169962 | 0.020047 | 0.764325 | 0.811530 | 0.680992 | 0.723769 | 0.722658 | 0.767649 |
| 1 | 0.091644 | 0.004399 | 0.730249 | 0.812082 | 0.653198 | 0.737427 | 0.691724 | 0.774755 |
| 2 | 0.053212 | 0.005024 | 0.770322 | 0.805540 | 0.722016 | 0.729347 | 0.746169 | 0.767444 |
| 3 | 0.048281 | 0.004117 | 0.737384 | 0.808907 | 0.660065 | 0.744762 | 0.698724 | 0.776834 |
| 4 | 0.054818 | 0.004255 | 0.798145 | 0.800473 | 0.704221 | 0.721650 | 0.751183 | 0.761061 |
| Mean | 0.083583 | 0.007568 | 0.760085 | 0.807706 | 0.684098 | 0.731391 | 0.722092 | 0.769549 |
| SD | 0.045874 | 0.006247 | 0.024400 | 0.004295 | 0.026018 | 0.008630 | 0.024061 | 0.005661 |
# Unpruned decision tree on the full feature set — the table below shows a
# perfect train score (1.0) vs ~0.56 test, i.e. heavy overfitting.
get_score(DecisionTreeClassifier(), X, y)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.043706 | 0.008076 | 0.598372 | 1.0 | 0.598372 | 1.0 | 0.598372 | 1.0 |
| 1 | 0.032171 | 0.006935 | 0.583952 | 1.0 | 0.583952 | 1.0 | 0.583952 | 1.0 |
| 2 | 0.037395 | 0.005528 | 0.557634 | 1.0 | 0.557634 | 1.0 | 0.557634 | 1.0 |
| 3 | 0.031088 | 0.004909 | 0.573377 | 1.0 | 0.573377 | 1.0 | 0.573377 | 1.0 |
| 4 | 0.059523 | 0.008391 | 0.508117 | 1.0 | 0.508117 | 1.0 | 0.508117 | 1.0 |
| Mean | 0.040777 | 0.006768 | 0.564290 | 1.0 | 0.564290 | 1.0 | 0.564290 | 1.0 |
| SD | 0.010386 | 0.001369 | 0.031082 | 0.0 | 0.031082 | 0.0 | 0.031082 | 0.0 |
# Same baseline decision tree restricted to the 30 selected features.
get_score(DecisionTreeClassifier(), X_selected30, y)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.027298 | 0.007861 | 0.555873 | 1.0 | 0.555873 | 1.0 | 0.555873 | 1.0 |
| 1 | 0.020464 | 0.006879 | 0.547925 | 1.0 | 0.547925 | 1.0 | 0.547925 | 1.0 |
| 2 | 0.016663 | 0.005932 | 0.601751 | 1.0 | 0.601751 | 1.0 | 0.601751 | 1.0 |
| 3 | 0.014897 | 0.005834 | 0.582468 | 1.0 | 0.582468 | 1.0 | 0.582468 | 1.0 |
| 4 | 0.015205 | 0.006414 | 0.569156 | 1.0 | 0.569156 | 1.0 | 0.569156 | 1.0 |
| Mean | 0.018905 | 0.006584 | 0.571435 | 1.0 | 0.571435 | 1.0 | 0.571435 | 1.0 |
| SD | 0.004640 | 0.000739 | 0.019188 | 0.0 | 0.019188 | 0.0 | 0.019188 | 0.0 |
# Decision tree on the 30 selected features, fit with balanced sample weights.
get_score(DecisionTreeClassifier(), X_selected30, y, sample_weight=True)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.014740 | 0.004771 | 0.539692 | 1.0 | 0.539692 | 1.0 | 0.539692 | 1.0 |
| 1 | 0.017605 | 0.005632 | 0.538073 | 1.0 | 0.538073 | 1.0 | 0.538073 | 1.0 |
| 2 | 0.015950 | 0.005952 | 0.552779 | 1.0 | 0.552779 | 1.0 | 0.552779 | 1.0 |
| 3 | 0.014085 | 0.005938 | 0.600325 | 1.0 | 0.600325 | 1.0 | 0.600325 | 1.0 |
| 4 | 0.013934 | 0.005551 | 0.559091 | 1.0 | 0.559091 | 1.0 | 0.559091 | 1.0 |
| Mean | 0.015263 | 0.005569 | 0.557992 | 1.0 | 0.557992 | 1.0 | 0.557992 | 1.0 |
| SD | 0.001370 | 0.000430 | 0.022592 | 0.0 | 0.022592 | 0.0 | 0.022592 | 0.0 |
# Fit a shallow, regularized tree purely for visualization of the split structure.
dt = DecisionTreeClassifier(min_samples_split=10, min_samples_leaf=5, max_depth=5)
dt.fit(X_selected30, y)
# Large canvas so the node labels stay legible in the exported image.
plt.figure(figsize=(40,40))
plot_tree(dt)
plt.savefig("dt.png")
The figure above shows which variables are most important in the tree-based model.
Random Forest
Random Forest is an ensemble method built from many basic decision tree models.
We repeated the same evaluation steps for the Random Forest method.
# Baseline random forest on the full feature set.
get_score(RandomForestClassifier(), X, y)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.319953 | 0.029031 | 0.682895 | 1.0 | 0.529412 | 1.0 | 0.606154 | 1.0 |
| 1 | 0.325046 | 0.030169 | 0.797164 | 1.0 | 0.514706 | 1.0 | 0.655935 | 1.0 |
| 2 | 0.331128 | 0.046951 | 0.824910 | 1.0 | 0.500000 | 1.0 | 0.662455 | 1.0 |
| 3 | 0.310518 | 0.030483 | 0.751994 | 1.0 | 0.514286 | 1.0 | 0.633140 | 1.0 |
| 4 | 0.316531 | 0.029119 | 0.759740 | 1.0 | 0.500000 | 1.0 | 0.629870 | 1.0 |
| Mean | 0.320635 | 0.033151 | 0.763341 | 1.0 | 0.511681 | 1.0 | 0.637511 | 1.0 |
| SD | 0.007060 | 0.006924 | 0.048058 | 0.0 | 0.010984 | 0.0 | 0.020112 | 0.0 |
# Random forest on the 20-feature subset.
get_score(RandomForestClassifier(), X_selected20, y)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.292384 | 0.028698 | 0.727251 | 1.0 | 0.526176 | 1.0 | 0.626713 | 1.0 |
| 1 | 0.281991 | 0.028711 | 0.767942 | 1.0 | 0.542500 | 1.0 | 0.655221 | 1.0 |
| 2 | 0.289282 | 0.030254 | 0.805587 | 1.0 | 0.511470 | 1.0 | 0.658528 | 1.0 |
| 3 | 0.327876 | 0.030368 | 0.803015 | 1.0 | 0.525325 | 1.0 | 0.664170 | 1.0 |
| 4 | 0.280770 | 0.028935 | 0.810622 | 1.0 | 0.542857 | 1.0 | 0.676739 | 1.0 |
| Mean | 0.294461 | 0.029393 | 0.782883 | 1.0 | 0.529665 | 1.0 | 0.656274 | 1.0 |
| SD | 0.017266 | 0.000755 | 0.031649 | 0.0 | 0.011839 | 0.0 | 0.016500 | 0.0 |
# Random forest on the 30-feature subset.
get_score(RandomForestClassifier(), X_selected30, y)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.264792 | 0.028331 | 0.752427 | 1.0 | 0.513088 | 1.0 | 0.632757 | 1.0 |
| 1 | 0.282824 | 0.031027 | 0.743956 | 1.0 | 0.514706 | 1.0 | 0.629331 | 1.0 |
| 2 | 0.257291 | 0.030990 | 0.809014 | 1.0 | 0.544118 | 1.0 | 0.676566 | 1.0 |
| 3 | 0.321685 | 0.031830 | 0.789054 | 1.0 | 0.539610 | 1.0 | 0.664332 | 1.0 |
| 4 | 0.283617 | 0.028386 | 0.751299 | 1.0 | 0.526948 | 1.0 | 0.639123 | 1.0 |
| Mean | 0.282042 | 0.030113 | 0.769150 | 1.0 | 0.527694 | 1.0 | 0.648422 | 1.0 |
| SD | 0.022293 | 0.001464 | 0.025371 | 0.0 | 0.012604 | 0.0 | 0.018663 | 0.0 |
# Random forest on the 35-feature subset.
get_score(RandomForestClassifier(), X_selected35, y)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.240579 | 0.029954 | 0.760851 | 1.0 | 0.524557 | 1.00000 | 0.642704 | 1.000000 |
| 1 | 0.230034 | 0.029966 | 0.785789 | 1.0 | 0.527794 | 1.00000 | 0.656791 | 1.000000 |
| 2 | 0.237544 | 0.030092 | 0.760518 | 1.0 | 0.493528 | 1.00000 | 0.627023 | 1.000000 |
| 3 | 0.277579 | 0.048161 | 0.735343 | 1.0 | 0.511039 | 1.00000 | 0.623191 | 1.000000 |
| 4 | 0.229301 | 0.028764 | 0.782978 | 1.0 | 0.542857 | 0.99635 | 0.662917 | 0.998175 |
| Mean | 0.243007 | 0.033387 | 0.765096 | 1.0 | 0.519955 | 0.99927 | 0.642525 | 0.999635 |
| SD | 0.017816 | 0.007403 | 0.018288 | 0.0 | 0.016644 | 0.00146 | 0.015707 | 0.000730 |
# Random forest on the features selected by the RF-based selector.
get_score(RandomForestClassifier(), X_selected_rf, y)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.291745 | 0.028540 | 0.757805 | 1.0 | 0.529412 | 1.0 | 0.643608 | 1.0 |
| 1 | 0.285761 | 0.029537 | 0.747002 | 1.0 | 0.542500 | 1.0 | 0.644751 | 1.0 |
| 2 | 0.295477 | 0.033027 | 0.799543 | 1.0 | 0.496764 | 1.0 | 0.648153 | 1.0 |
| 3 | 0.323366 | 0.029363 | 0.815584 | 1.0 | 0.498377 | 1.0 | 0.656981 | 1.0 |
| 4 | 0.284330 | 0.029310 | 0.796104 | 1.0 | 0.528571 | 1.0 | 0.662338 | 1.0 |
| Mean | 0.296136 | 0.029955 | 0.783208 | 1.0 | 0.519125 | 1.0 | 0.651166 | 1.0 |
| SD | 0.014200 | 0.001574 | 0.026221 | 0.0 | 0.018286 | 0.0 | 0.007293 | 0.0 |
# Random forest on the features selected by the LR-based selector.
get_score(RandomForestClassifier(), X_selected_lr, y)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.263955 | 0.028495 | 0.745574 | 1.0 | 0.496764 | 1.0 | 0.621169 | 1.0 |
| 1 | 0.257052 | 0.029931 | 0.787883 | 1.0 | 0.527794 | 1.0 | 0.657838 | 1.0 |
| 2 | 0.262699 | 0.030726 | 0.791595 | 1.0 | 0.513088 | 1.0 | 0.652342 | 1.0 |
| 3 | 0.280024 | 0.030075 | 0.774861 | 1.0 | 0.514286 | 1.0 | 0.644573 | 1.0 |
| 4 | 0.248098 | 0.029528 | 0.735529 | 1.0 | 0.506169 | 1.0 | 0.620849 | 1.0 |
| Mean | 0.262366 | 0.029751 | 0.767088 | 1.0 | 0.511620 | 1.0 | 0.639354 | 1.0 |
| SD | 0.010452 | 0.000737 | 0.022593 | 0.0 | 0.010210 | 0.0 | 0.015561 | 0.0 |
# Random forest on the features selected by the GB-based selector.
get_score(RandomForestClassifier(), X_selected_gb, y)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.283158 | 0.030033 | 0.805682 | 1.0 | 0.513088 | 1.0 | 0.659385 | 1.0 |
| 1 | 0.266032 | 0.029483 | 0.748572 | 1.0 | 0.514706 | 1.0 | 0.631639 | 1.0 |
| 2 | 0.281929 | 0.031911 | 0.773701 | 1.0 | 0.513088 | 1.0 | 0.643394 | 1.0 |
| 3 | 0.309316 | 0.029970 | 0.815445 | 1.0 | 0.523701 | 1.0 | 0.669573 | 1.0 |
| 4 | 0.262655 | 0.027915 | 0.766048 | 1.0 | 0.511039 | 1.0 | 0.638544 | 1.0 |
| Mean | 0.280618 | 0.029862 | 0.781890 | 1.0 | 0.515124 | 1.0 | 0.648507 | 1.0 |
| SD | 0.016536 | 0.001279 | 0.024981 | 0.0 | 0.004444 | 0.0 | 0.013943 | 0.0 |
# Random forest on the 30-feature subset, fit with balanced sample weights.
get_score(RandomForestClassifier(), X_selected30, y, sample_weight=True)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.251956 | 0.029300 | 0.789930 | 1.000000e+00 | 0.527794 | 1.0 | 0.658862 | 1.0 |
| 1 | 0.241212 | 0.029418 | 0.705597 | 1.000000e+00 | 0.514706 | 1.0 | 0.610151 | 1.0 |
| 2 | 0.245900 | 0.029197 | 0.766229 | 1.000000e+00 | 0.514706 | 1.0 | 0.640467 | 1.0 |
| 3 | 0.277318 | 0.030171 | 0.792115 | 1.000000e+00 | 0.514286 | 1.0 | 0.653200 | 1.0 |
| 4 | 0.240229 | 0.030080 | 0.694712 | 1.000000e+00 | 0.511039 | 1.0 | 0.602876 | 1.0 |
| Mean | 0.251323 | 0.029633 | 0.749717 | 1.000000e+00 | 0.516506 | 1.0 | 0.633111 | 1.0 |
| SD | 0.013644 | 0.000409 | 0.041616 | 7.021667e-17 | 0.005809 | 0.0 | 0.022637 | 0.0 |
Optimizations
# Parameter grid for tuning the random forest (1*5*2*3*3*3 = 270 candidates).
rf_grid_search = {'oob_score': [True],
'max_depth': [15, 20, 25, 30, 35],
'max_features': ['sqrt', 'log2'],
'min_samples_leaf': [2, 5, 10],
'min_samples_split': [2, 3, 5],
'n_estimators': [150, 200, 250]
}
# Exhaustive search scored with the project's custom probability-based metric;
# stratified 5-fold CV preserves the class ratio in every fold.
rf_model_grid = GridSearchCV(estimator=RandomForestClassifier(),
param_grid=rf_grid_search,
scoring = make_scorer(custom_score,needs_proba=True),
cv=StratifiedKFold(n_splits=5, shuffle=True), verbose=5, n_jobs=-1)
rf_model_grid.fit(X_selected30, y)
print(rf_model_grid.best_params_)
# Re-evaluate the winning estimator with the shared get_score helper.
get_score(rf_model_grid.best_estimator_, X_selected30, y)
Fitting 5 folds for each of 270 candidates, totalling 1350 fits
{'max_depth': 25, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 3, 'n_estimators': 250, 'oob_score': True}
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.631820 | 0.066543 | 0.727013 | 1.0 | 0.511470 | 0.891304 | 0.619241 | 0.945652 |
| 1 | 0.668981 | 0.090793 | 0.762897 | 1.0 | 0.514706 | 0.865942 | 0.638802 | 0.932971 |
| 2 | 0.608214 | 0.060503 | 0.734723 | 1.0 | 0.514706 | 0.855072 | 0.624714 | 0.927536 |
| 3 | 0.616962 | 0.061023 | 0.810714 | 1.0 | 0.528571 | 0.861314 | 0.669643 | 0.930657 |
| 4 | 0.628187 | 0.063811 | 0.793599 | 1.0 | 0.512662 | 0.864964 | 0.653131 | 0.932482 |
| Mean | 0.630833 | 0.068535 | 0.765789 | 1.0 | 0.516423 | 0.867719 | 0.641106 | 0.933860 |
| SD | 0.020819 | 0.011337 | 0.032461 | 0.0 | 0.006199 | 0.012393 | 0.018499 | 0.006197 |
Gradient Boost
In Gradient Boosting, each predictor tries to improve on its predecessor by reducing the errors. Instead of fitting a predictor to the data at each iteration, it fits a new predictor to the residual errors made by the previous predictor.
We followed the same steps for GradientBoostingClassifier, XGBoost and CatBoost.
The main difference in GradientBoost is the way we did the grid search, which is stepwise.
We tried the most important parameters first and searched the grid for a small number of parameters at a time.
At the end, we did grid search for all parameters but used a small range.
# Baseline gradient boosting on the full feature set.
get_score(GradientBoostingClassifier(), X, y)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.835715 | 0.005850 | 0.821721 | 0.983857 | 0.511470 | 0.746377 | 0.666595 | 0.865117 |
| 1 | 0.861605 | 0.006338 | 0.749619 | 0.986200 | 0.565439 | 0.771334 | 0.657529 | 0.878767 |
| 2 | 0.860435 | 0.005920 | 0.764420 | 0.986605 | 0.567057 | 0.731479 | 0.665739 | 0.859042 |
| 3 | 0.859476 | 0.005969 | 0.795176 | 0.987949 | 0.515584 | 0.740876 | 0.655380 | 0.864412 |
| 4 | 0.858093 | 0.006461 | 0.687848 | 0.992588 | 0.493506 | 0.802110 | 0.590677 | 0.897349 |
| Mean | 0.855065 | 0.006108 | 0.763757 | 0.987440 | 0.530611 | 0.758435 | 0.647184 | 0.872938 |
| SD | 0.009743 | 0.000244 | 0.045408 | 0.002893 | 0.030034 | 0.025512 | 0.028595 | 0.013834 |
# Gradient boosting on the 20-feature subset.
get_score(GradientBoostingClassifier(), X_selected20, y)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.602328 | 0.005612 | 0.743290 | 0.982101 | 0.536027 | 0.753623 | 0.639658 | 0.867862 |
| 1 | 0.617675 | 0.010857 | 0.794118 | 0.980328 | 0.560584 | 0.738725 | 0.677351 | 0.859527 |
| 2 | 0.601705 | 0.006202 | 0.799638 | 0.986136 | 0.498382 | 0.768116 | 0.649010 | 0.877126 |
| 3 | 0.593806 | 0.006320 | 0.804731 | 0.986146 | 0.542857 | 0.722628 | 0.673794 | 0.854387 |
| 4 | 0.582897 | 0.005587 | 0.692115 | 0.987801 | 0.517208 | 0.758314 | 0.604661 | 0.873058 |
| Mean | 0.599682 | 0.006916 | 0.766778 | 0.984502 | 0.531012 | 0.748281 | 0.648895 | 0.866392 |
| SD | 0.011411 | 0.001993 | 0.043345 | 0.002809 | 0.021426 | 0.015950 | 0.026350 | 0.008403 |
# Gradient boosting on the 30-feature subset.
get_score(GradientBoostingClassifier(), X_selected30, y)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.433493 | 0.005555 | 0.736627 | 0.988778 | 0.526176 | 0.738725 | 0.631401 | 0.863752 |
| 1 | 0.418880 | 0.005390 | 0.811251 | 0.981585 | 0.558824 | 0.731479 | 0.685037 | 0.856532 |
| 2 | 0.488240 | 0.005700 | 0.795545 | 0.975748 | 0.539263 | 0.695247 | 0.667404 | 0.835497 |
| 3 | 0.453737 | 0.005258 | 0.739425 | 0.973078 | 0.499675 | 0.722223 | 0.619550 | 0.847651 |
| 4 | 0.453936 | 0.005931 | 0.748980 | 0.989775 | 0.571104 | 0.744526 | 0.660042 | 0.867150 |
| Mean | 0.449657 | 0.005567 | 0.766365 | 0.981793 | 0.539008 | 0.726440 | 0.652687 | 0.854116 |
| SD | 0.023376 | 0.000235 | 0.030915 | 0.006709 | 0.025038 | 0.017287 | 0.023951 | 0.011463 |
# Gradient boosting on the 35-feature subset.
get_score(GradientBoostingClassifier(), X_selected35, y)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.405741 | 0.005887 | 0.743670 | 0.984838 | 0.490291 | 0.750000 | 0.616981 | 0.867419 |
| 1 | 0.398976 | 0.005591 | 0.786122 | 0.977838 | 0.563821 | 0.724232 | 0.674971 | 0.851035 |
| 2 | 0.435833 | 0.005925 | 0.745479 | 0.975425 | 0.524557 | 0.717391 | 0.635018 | 0.846408 |
| 3 | 0.389565 | 0.005134 | 0.785807 | 0.978584 | 0.523701 | 0.732767 | 0.654754 | 0.855675 |
| 4 | 0.381726 | 0.005254 | 0.786271 | 0.977027 | 0.506169 | 0.718168 | 0.646220 | 0.847597 |
| Mean | 0.402368 | 0.005558 | 0.769470 | 0.978742 | 0.521708 | 0.728512 | 0.645589 | 0.853627 |
| SD | 0.018616 | 0.000322 | 0.020336 | 0.003223 | 0.024552 | 0.012073 | 0.019380 | 0.007611 |
# Gradient boosting on the features selected by the RF-based selector.
get_score(GradientBoostingClassifier(), X_selected_rf, y)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.577960 | 0.005519 | 0.756139 | 0.985959 | 0.521321 | 0.757246 | 0.638730 | 0.871603 |
| 1 | 0.586481 | 0.005803 | 0.738054 | 0.990176 | 0.581763 | 0.760870 | 0.659909 | 0.875523 |
| 2 | 0.575354 | 0.005417 | 0.810013 | 0.985308 | 0.552351 | 0.775362 | 0.681182 | 0.880335 |
| 3 | 0.572038 | 0.005332 | 0.772913 | 0.985325 | 0.491883 | 0.770073 | 0.632398 | 0.877699 |
| 4 | 0.600685 | 0.005609 | 0.781076 | 0.984255 | 0.536364 | 0.744526 | 0.658720 | 0.864390 |
| Mean | 0.582504 | 0.005536 | 0.771639 | 0.986204 | 0.536736 | 0.761615 | 0.654188 | 0.873910 |
| SD | 0.010273 | 0.000163 | 0.024213 | 0.002060 | 0.030069 | 0.010698 | 0.017296 | 0.005553 |
# Gradient boosting on the features selected by the GB-based selector.
get_score(GradientBoostingClassifier(), X_selected_gb, y)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.519755 | 0.006119 | 0.796021 | 0.983452 | 0.518085 | 0.760464 | 0.657053 | 0.871958 |
| 1 | 0.503094 | 0.006214 | 0.704359 | 0.991080 | 0.552351 | 0.775362 | 0.628355 | 0.883221 |
| 2 | 0.582783 | 0.005408 | 0.812583 | 0.983475 | 0.549115 | 0.756841 | 0.680849 | 0.870158 |
| 3 | 0.479677 | 0.005378 | 0.838776 | 0.987183 | 0.555519 | 0.776967 | 0.697147 | 0.882075 |
| 4 | 0.485078 | 0.005331 | 0.738312 | 0.988037 | 0.553896 | 0.748175 | 0.646104 | 0.868106 |
| Mean | 0.514077 | 0.005690 | 0.778010 | 0.986646 | 0.545793 | 0.763562 | 0.661902 | 0.875104 |
| SD | 0.037142 | 0.000391 | 0.049422 | 0.002903 | 0.014015 | 0.011050 | 0.024494 | 0.006290 |
# Gradient boosting on the features selected by the LR-based selector.
get_score(GradientBoostingClassifier(), X_selected_lr, y)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.447110 | 0.005576 | 0.815153 | 0.983646 | 0.539263 | 0.749595 | 0.677208 | 0.866620 |
| 1 | 0.430013 | 0.005175 | 0.748049 | 0.976029 | 0.514849 | 0.771334 | 0.631449 | 0.873682 |
| 2 | 0.489468 | 0.006072 | 0.795926 | 0.983082 | 0.563821 | 0.738725 | 0.679873 | 0.860904 |
| 3 | 0.443403 | 0.005975 | 0.740492 | 0.985094 | 0.496429 | 0.773723 | 0.618460 | 0.879408 |
| 4 | 0.454798 | 0.006128 | 0.784879 | 0.980715 | 0.540909 | 0.758314 | 0.662894 | 0.869514 |
| Mean | 0.452958 | 0.005785 | 0.776900 | 0.981713 | 0.531054 | 0.758338 | 0.653977 | 0.870026 |
| SD | 0.019942 | 0.000361 | 0.028450 | 0.003173 | 0.023237 | 0.013166 | 0.024734 | 0.006269 |
# Gradient boosting on the 30-feature subset, fit with balanced sample weights.
get_score(GradientBoostingClassifier(), X_selected30, y, sample_weight=True)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.462626 | 0.006231 | 0.764040 | 0.988032 | 0.682324 | 0.945729 | 0.723182 | 0.966880 |
| 1 | 0.461641 | 0.010050 | 0.775652 | 0.986981 | 0.672473 | 0.936814 | 0.724062 | 0.961898 |
| 2 | 0.440224 | 0.007345 | 0.732248 | 0.982495 | 0.607034 | 0.931976 | 0.669641 | 0.957235 |
| 3 | 0.424531 | 0.005330 | 0.819573 | 0.986483 | 0.739610 | 0.937241 | 0.779592 | 0.961862 |
| 4 | 0.459564 | 0.005857 | 0.796846 | 0.986176 | 0.684091 | 0.940075 | 0.740468 | 0.963125 |
| Mean | 0.449717 | 0.006963 | 0.777672 | 0.986033 | 0.677107 | 0.938367 | 0.727389 | 0.962200 |
| SD | 0.015035 | 0.001679 | 0.029586 | 0.001878 | 0.042224 | 0.004509 | 0.035383 | 0.003086 |
Feature selection seems to work well with 30 features.
We can continue with X_selected30.
Also, using sample weights when fitting our models works well.
# Stepwise GBC tuning, step 1: search only n_estimators (150..250 in 11 steps).
gbc_grid_search = {'n_estimators': list(np.linspace(150, 250, 11, dtype=int))}
gbc_model_grid = GridSearchCV(estimator=GradientBoostingClassifier(),
param_grid=gbc_grid_search,
scoring = make_scorer(custom_score,needs_proba=True),
cv=StratifiedKFold(n_splits=5, shuffle=True), verbose=5, n_jobs=-1)
# Balanced sample weights compensate for the class imbalance during fitting.
gbc_model_grid.fit(X_selected30, y, sample_weight=class_weight.compute_sample_weight(class_weight='balanced', y=y))
print(gbc_model_grid.best_params_)
get_score(gbc_model_grid.best_estimator_, X_selected30, y, sample_weight=True)
Fitting 5 folds for each of 11 candidates, totalling 55 fits
{'n_estimators': 150}
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.653237 | 0.006349 | 0.759471 | 0.993693 | 0.605416 | 0.961126 | 0.682443 | 0.977409 |
| 1 | 0.661441 | 0.006284 | 0.759471 | 0.995332 | 0.649391 | 0.963128 | 0.704431 | 0.979230 |
| 2 | 0.689558 | 0.006076 | 0.731201 | 0.997275 | 0.618218 | 0.965559 | 0.674710 | 0.981417 |
| 3 | 0.634667 | 0.005713 | 0.750557 | 0.996495 | 0.641883 | 0.968421 | 0.696220 | 0.982458 |
| 4 | 0.638913 | 0.005816 | 0.769759 | 0.992441 | 0.602922 | 0.955865 | 0.686340 | 0.974153 |
| Mean | 0.655563 | 0.006048 | 0.754092 | 0.995047 | 0.623566 | 0.962820 | 0.688829 | 0.978933 |
| SD | 0.019542 | 0.000250 | 0.012960 | 0.001776 | 0.018903 | 0.004248 | 0.010434 | 0.002962 |
# Stepwise GBC tuning, step 2: search max_depth and min_samples_split.
# NOTE(review): the base estimator hard-codes n_estimators=230 although step 1
# reported 150 as best — confirm this override is intentional.
gbc_grid_search = {'max_depth':range(3,10,1), 'min_samples_split':[2,3,4,5,10]}
gbc_model_grid = GridSearchCV(estimator=GradientBoostingClassifier(n_estimators=230),
param_grid=gbc_grid_search,
scoring = make_scorer(custom_score,needs_proba=True),
cv=StratifiedKFold(n_splits=5, shuffle=True), verbose=5, n_jobs=-1)
gbc_model_grid.fit(X_selected30, y, sample_weight=class_weight.compute_sample_weight(class_weight='balanced', y=y))
print(gbc_model_grid.best_params_)
get_score(gbc_model_grid.best_estimator_, X_selected30, y, sample_weight=True)
Fitting 5 folds for each of 35 candidates, totalling 175 fits
{'max_depth': 3, 'min_samples_split': 4}
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1.369174 | 0.007593 | 0.791262 | 0.999618 | 0.641015 | 0.981361 | 0.716138 | 0.990490 |
| 1 | 1.361363 | 0.007776 | 0.676756 | 0.999800 | 0.588806 | 0.985413 | 0.632781 | 0.992607 |
| 2 | 1.050994 | 0.007093 | 0.761374 | 0.999612 | 0.624691 | 0.982172 | 0.693033 | 0.990892 |
| 3 | 0.999658 | 0.006924 | 0.752597 | 0.999551 | 0.544805 | 0.984211 | 0.648701 | 0.991881 |
| 4 | 1.051374 | 0.007297 | 0.757328 | 0.999888 | 0.602273 | 0.984615 | 0.679801 | 0.992252 |
| Mean | 1.166513 | 0.007337 | 0.747864 | 0.999694 | 0.600318 | 0.983554 | 0.674091 | 0.991624 |
| SD | 0.163389 | 0.000313 | 0.038037 | 0.000128 | 0.033070 | 0.001532 | 0.030035 | 0.000806 |
# Stepwise GBC tuning, step 3: search max_features.
# NOTE(review): n_estimators=240 and min_samples_split=3 differ from the values
# selected in the previous steps (150/230 and 4) — confirm deliberate.
gbc_grid_search = {'max_features':[None, "sqrt", "log2"]}
gbc_model_grid = GridSearchCV(estimator=GradientBoostingClassifier(n_estimators=240, max_depth=3, min_samples_split=3),
param_grid=gbc_grid_search,
scoring = make_scorer(custom_score,needs_proba=True),
cv=StratifiedKFold(n_splits=5, shuffle=True), verbose=5, n_jobs=-1)
gbc_model_grid.fit(X_selected30, y, sample_weight=class_weight.compute_sample_weight(class_weight='balanced', y=y))
print(gbc_model_grid.best_params_)
get_score(gbc_model_grid.best_estimator_, X_selected30, y, sample_weight=True)
Fitting 5 folds for each of 3 candidates, totalling 15 fits
{'max_features': 'log2'}
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.265739 | 0.007006 | 0.761755 | 0.997175 | 0.637921 | 0.968801 | 0.699838 | 0.982988 |
| 1 | 0.274744 | 0.007125 | 0.777175 | 0.997287 | 0.621597 | 0.972042 | 0.699386 | 0.984665 |
| 2 | 0.263544 | 0.007032 | 0.713307 | 0.998456 | 0.608367 | 0.976499 | 0.660837 | 0.987477 |
| 3 | 0.265221 | 0.007062 | 0.770315 | 0.998434 | 0.624026 | 0.976923 | 0.697171 | 0.987678 |
| 4 | 0.283700 | 0.009984 | 0.801299 | 0.997955 | 0.603571 | 0.973684 | 0.702435 | 0.985820 |
| Mean | 0.270589 | 0.007642 | 0.764770 | 0.997861 | 0.619096 | 0.973590 | 0.691933 | 0.985726 |
| SD | 0.007631 | 0.001172 | 0.028902 | 0.000546 | 0.012174 | 0.002997 | 0.015638 | 0.001761 |
# Stepwise GBC tuning, step 4: search the subsample fraction.
# NOTE(review): max_features='sqrt' is hard-coded here although step 3
# reported 'log2' as best — confirm deliberate.
gbc_grid_search = {'subsample':[0.6,0.7,0.75,0.8,0.85,0.9, 0.95, 1]}
gbc_model_grid = GridSearchCV(estimator=GradientBoostingClassifier(n_estimators=230, max_depth=3, min_samples_split=3, max_features='sqrt'),
param_grid=gbc_grid_search,
scoring = make_scorer(custom_score,needs_proba=True),
cv=StratifiedKFold(n_splits=5, shuffle=True), verbose=5, n_jobs=-1)
gbc_model_grid.fit(X_selected30, y,sample_weight=class_weight.compute_sample_weight(class_weight='balanced', y=y))
print(gbc_model_grid.best_params_)
get_score(gbc_model_grid.best_estimator_, X_selected30, y, sample_weight=True)
Fitting 5 folds for each of 8 candidates, totalling 40 fits
{'subsample': 1}
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.550220 | 0.014804 | 0.822006 | 0.998573 | 0.636018 | 0.971232 | 0.729012 | 0.984902 |
| 1 | 0.639400 | 0.012387 | 0.737864 | 0.998227 | 0.642918 | 0.974878 | 0.690391 | 0.986553 |
| 2 | 0.502251 | 0.008917 | 0.726632 | 0.998949 | 0.559395 | 0.976499 | 0.643014 | 0.987724 |
| 3 | 0.478468 | 0.015583 | 0.786735 | 0.998073 | 0.706494 | 0.973279 | 0.746614 | 0.985676 |
| 4 | 0.511250 | 0.013659 | 0.773191 | 0.998434 | 0.590909 | 0.971660 | 0.682050 | 0.985047 |
| Mean | 0.536318 | 0.013070 | 0.769286 | 0.998451 | 0.627147 | 0.973510 | 0.698216 | 0.985980 |
| SD | 0.056488 | 0.002340 | 0.034368 | 0.000302 | 0.050054 | 0.001974 | 0.036498 | 0.001048 |
# Stepwise GBC tuning, step 5: search the learning rate on the tuned base model.
gbc_grid_search = {'learning_rate':[0.001, 0.01,0.05,0.1,0.2]}
gbc_model_grid = GridSearchCV(estimator=GradientBoostingClassifier(subsample=0.95, n_estimators=230, max_depth=3, min_samples_split=3, max_features='sqrt'),
param_grid=gbc_grid_search,
scoring = make_scorer(custom_score,needs_proba=True),
cv=StratifiedKFold(n_splits=5, shuffle=True), verbose=5, n_jobs=-1)
gbc_model_grid.fit(X_selected30, y,sample_weight=class_weight.compute_sample_weight(class_weight='balanced', y=y))
print(gbc_model_grid.best_params_)
get_score(gbc_model_grid.best_estimator_, X_selected30, y, sample_weight=True)
Fitting 5 folds for each of 5 candidates, totalling 25 fits
{'learning_rate': 0.001}
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.447520 | 0.009740 | 0.740815 | 0.882930 | 0.676423 | 0.810796 | 0.708619 | 0.846863 |
| 1 | 0.414526 | 0.010170 | 0.755854 | 0.877481 | 0.675995 | 0.793513 | 0.715924 | 0.835497 |
| 2 | 0.377449 | 0.014536 | 0.755663 | 0.876371 | 0.658053 | 0.796279 | 0.706858 | 0.836325 |
| 3 | 0.522452 | 0.008937 | 0.870408 | 0.866911 | 0.783117 | 0.792987 | 0.826763 | 0.829949 |
| 4 | 0.353696 | 0.008041 | 0.794527 | 0.870960 | 0.721104 | 0.800674 | 0.757815 | 0.835817 |
| Mean | 0.423129 | 0.010285 | 0.783453 | 0.874931 | 0.702938 | 0.798850 | 0.743196 | 0.836890 |
| SD | 0.059062 | 0.002247 | 0.046980 | 0.005526 | 0.045172 | 0.006565 | 0.045732 | 0.005496 |
# Manually chosen configuration re-evaluated with balanced sample weights
# (learning_rate=0.05 rather than the 0.001 the grid reported as best).
get_score(GradientBoostingClassifier(learning_rate=0.05, n_estimators=230, max_depth=3, min_samples_split=3, max_features='sqrt', subsample=0.95),
X_selected30, y, sample_weight=True)
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.331406 | 0.007875 | 0.795260 | 0.981614 | 0.703360 | 0.935622 | 0.749310 | 0.958618 |
| 1 | 0.307916 | 0.007284 | 0.729393 | 0.985912 | 0.636446 | 0.934002 | 0.682919 | 0.959957 |
| 2 | 0.345461 | 0.008543 | 0.763849 | 0.980710 | 0.623501 | 0.935217 | 0.693675 | 0.957963 |
| 3 | 0.323301 | 0.007818 | 0.819295 | 0.981388 | 0.664935 | 0.937241 | 0.742115 | 0.959315 |
| 4 | 0.317177 | 0.007163 | 0.757885 | 0.985325 | 0.611039 | 0.942909 | 0.684462 | 0.964117 |
| Mean | 0.325052 | 0.007737 | 0.773136 | 0.982990 | 0.647856 | 0.936998 | 0.710496 | 0.959994 |
| SD | 0.012772 | 0.000492 | 0.031153 | 0.002175 | 0.033019 | 0.003132 | 0.029077 | 0.002167 |
# Final joint grid over all tuned parameters at once, each in a small range
# around the stepwise results (5*3*3*3*3*2 = 810 candidates).
gbc_grid_search = {'learning_rate':[0.001, 0.01,0.05,0.1,0.2],
'subsample':[0.9,0.95,1],
'n_estimators':[150,200,250],
'max_depth':[3,5,7],
'min_samples_split': [2,3,5],
'max_features':['sqrt',None]}
gbc_model_grid = GridSearchCV(estimator=GradientBoostingClassifier(),
param_grid=gbc_grid_search,
scoring = make_scorer(custom_score,needs_proba=True),
cv=StratifiedKFold(n_splits=5, shuffle=True), verbose=5, n_jobs=-1)
gbc_model_grid.fit(X_selected30, y,sample_weight=class_weight.compute_sample_weight(class_weight='balanced', y=y))
print(gbc_model_grid.best_params_)
get_score(gbc_model_grid.best_estimator_, X_selected30, y, sample_weight=True)
Fitting 5 folds for each of 810 candidates, totalling 4050 fits
{'learning_rate': 0.01, 'max_depth': 3, 'max_features': None, 'min_samples_split': 3, 'n_estimators': 200, 'subsample': 0.9}
| fit_time | score_time | test_roc_auc | train_roc_auc | test_balanced_accuracy | train_balanced_accuracy | test ort | train ort | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.925227 | 0.009694 | 0.824101 | 0.903827 | 0.757757 | 0.830198 | 0.790929 | 0.867012 |
| 1 | 0.980944 | 0.007608 | 0.820198 | 0.906138 | 0.736579 | 0.839041 | 0.778389 | 0.872589 |
| 2 | 0.901992 | 0.007685 | 0.824481 | 0.912327 | 0.768942 | 0.831842 | 0.796711 | 0.872084 |
| 3 | 1.014597 | 0.007702 | 0.678015 | 0.922255 | 0.538636 | 0.868779 | 0.608326 | 0.895517 |
| 4 | 0.863074 | 0.008053 | 0.784694 | 0.907409 | 0.717857 | 0.828644 | 0.751276 | 0.868027 |
| Mean | 0.937167 | 0.008148 | 0.786298 | 0.910391 | 0.703954 | 0.839701 | 0.745126 | 0.875046 |
| SD | 0.054372 | 0.000788 | 0.056150 | 0.006551 | 0.084505 | 0.014968 | 0.070168 | 0.010466 |
Submission
Among the ML models we evaluated, gradient boosting gives the best score.
# One-hot encode the two categorical columns of the held-out test set,
# mirroring the encoding applied to the training data.
test_f=test.copy()
test_f=pd.get_dummies(test_f, columns=['Var_39','Var_53'])
test_f.columns
Index(['loan_application_id', 'loan_amount', 'default', 'customer_age',
'Var_1', 'Var_2', 'Var_3', 'Var_4', 'Var_5', 'Var_6', 'Var_7', 'Var_8',
'Var_9', 'Var_10', 'Var_11', 'Var_12', 'Var_13', 'Var_14', 'Var_15',
'Var_16', 'Var_17', 'Var_18', 'Var_19', 'Var_20', 'Var_21', 'Var_22',
'Var_23', 'Var_24', 'Var_25', 'Var_26', 'Var_27', 'Var_28', 'Var_29',
'Var_30', 'Var_31', 'Var_32', 'Var_33', 'Var_34', 'Var_35', 'Var_36',
'Var_37', 'Var_38', 'Var_40', 'Var_41', 'Var_42', 'Var_43', 'Var_44',
'Var_45', 'Var_46', 'Var_47', 'Var_48', 'Var_49', 'Var_50', 'Var_51',
'Var_52', 'Var_54', 'Var_55', 'Var_56', 'Var_57', 'Var_58', 'Var_59',
'Var_39_N', 'Var_39_Y', 'Var_53_N', 'Var_53_Y'],
dtype='object')
# Select the model's input columns (drops the id and 'default' columns and the
# original categorical columns replaced by their dummies).
X_test=test_f[['loan_amount', 'customer_age',
'Var_1', 'Var_2', 'Var_3', 'Var_4', 'Var_5', 'Var_6', 'Var_7', 'Var_8',
'Var_9', 'Var_10', 'Var_11', 'Var_12', 'Var_13', 'Var_14', 'Var_15',
'Var_16', 'Var_17', 'Var_18', 'Var_19', 'Var_20', 'Var_21', 'Var_22',
'Var_23', 'Var_24', 'Var_25', 'Var_26', 'Var_27', 'Var_28', 'Var_29',
'Var_30', 'Var_31', 'Var_32', 'Var_33', 'Var_34', 'Var_35', 'Var_36',
'Var_37', 'Var_38', 'Var_40', 'Var_41', 'Var_42', 'Var_43', 'Var_44',
'Var_45', 'Var_46', 'Var_47', 'Var_48', 'Var_49', 'Var_50', 'Var_51',
'Var_52', 'Var_54', 'Var_55', 'Var_56', 'Var_57', 'Var_58', 'Var_59']].copy()
# Scale every test feature to [0, 1] in place, then keep a scaled copy.
# The original code duplicated the full 58-column list and created a dead
# intermediate assignment (X_scaled_test = X_test[:], overwritten immediately);
# assigning via X_test.columns is equivalent and removes the duplication.
# NOTE(review): the scaler is fit on the TEST data itself; to avoid train/test
# skew it should be fit on the training features and only transform() applied
# here — confirm against the training pipeline.
scaler = MinMaxScaler()
X_test[X_test.columns] = scaler.fit_transform(X_test)
X_scaled_test = X_test.copy()
n1 = 30  # The number of lowest-ranked features to drop (keeps the top 30 used in training)
features_to_drop1 = list(featureScores["Specs"].iloc[-n1:].values)
# Use drop(columns=...) instead of the positional axis argument, which was
# deprecated in pandas 1.0 and removed in pandas 2.0.
X_selected30_test = X_test.drop(columns=features_to_drop1)
# Final model configuration chosen for the submission.
# NOTE(review): these hyper-parameters differ from the final grid search's
# best_params_ (lr=0.01, depth=3, n_estimators=200, min_samples_split=3,
# subsample=0.9, max_features=None) — confirm the override is deliberate.
BEST_GBC = GradientBoostingClassifier(learning_rate=0.05,
subsample=0.9,
n_estimators=150,
max_depth=5,
min_samples_split=5,
max_features=None
)
# Fit on the full training set with balanced sample weights, then emit the
# positive-class probabilities for the test set.
BEST_GBC.fit(X_selected30, y, sample_weight=class_weight.compute_sample_weight(class_weight='balanced', y=y))
gbc_pred = BEST_GBC.predict_proba(X_selected30_test)[:,1]
# newline=',' writes all predictions on a single comma-separated line
# (with a trailing comma) — the expected submission format.
np.savetxt('Predictions.csv',gbc_pred,fmt='%.2f',delimiter=',',newline=',')
Many ML models could be developed for such a dataset. But instead of trying all of them, I tried the models that I thought would give good results. While trying the models, I also applied optimization and feature engineering.